* NOTE: if Stata does not do so automatically, cd to the top level directory of the country-level raw data folders, i.e., the "data" directory
* start from an empty session and reserve names for the temporary files used throughout the script
clear all
tempfile participations timestamps shortIDs translations ///
	endtime sequence docviews RAcoding

* results frame filled by the program collect_p: one row per (cleaning step, test) with the exact p-value
* obtained right after that step, to check that the various cleaning operations don't affect results
frame create pvalues str20 after str20 test double p

* collect_p <step>: for the data currently in memory, run exact tests of guilty against each of the two
* treatment variables and post the p-values to frame pvalues, tagged with the name of the cleaning step
program define collect_p
	args step
	quietly tabulate guilty precedent, exact
	frame post pvalues ("`step'") ("precedent") (r(p_exact))
	quietly tabulate guilty def_nat, exact
	frame post pvalues ("`step'") ("nationality") (r(p_exact))
end

/********************************************************************
	I. INGESTING XLSX & CSV FILES (other than RA coding of reasons)
*********************************************************************/

*** Main data (participations)

* begin with an empty accumulator file; each country's records are stacked onto it in turn
save `participations', emptyok
local countries Argentina Brazil China France Germany India USA
foreach country of local countries {

	* timestamps come from the csv export, because Excel messes up dates ...
	import delimited `country'/participations_timestamps.csv, varnames(1) clear
	rename random randomID
	save `timestamps', replace

	* ... all other fields come from the xlsx export, because csv doesn't work for text-->Stata (paragraph marks etc.)
	import excel using `country'/participations.xlsx, firstrow allstring clear
	* names must be harmonized here because naming of Q1-Q4 and canceled differs in US version
	if "`country'"=="USA" {
		rename exitquestionsquestion# exitq#
		}
	else {
		rename exitquestion#* exitq#
		rename canceledexplicitorauto canceled
		}
	drop timestamp*
	rename random randomID
	merge 1:1 randomID using `timestamps', assert(3) nogenerate
	generate Country = "`country'"
	append using `participations'
	save `participations', replace
	}

	
*** Translations of reasons

* complete map of shortID-->randomID, used for merging in translations that are identified by shortID only
* (also needed for section IV: RA coding)
keep randomID shortID Country
keep if !mi(shortID)
save `shortIDs'

* stack the translation files of the countries that have one
clear
save `translations', emptyok
local translated Argentina Brazil China France Germany
foreach country of local translated {
	import excel using `country'/reasons_translations.xlsx, firstrow allstring clear
	generate Country = "`country'"
	append using `translations'
	save `translations', replace
	}
keep if !mi(translation)
assert !mi(randomID) | !mi(shortID)

* set aside the translations that already have a randomID ...
preserve
	keep if !mi(randomID)
	save `translations', replace
restore

* ... and give the remaining ones their randomID via the shortID map
keep if mi(randomID)
drop randomID
merge 1:1 Country shortID using `shortIDs', assert(2 3) keep(3) nogenerate
append using `translations'
isid randomID
drop shortID
save `translations', replace

* attach translations to the main data; for India and USA (no translation file) the original text is used.
* Afterwards judgmentreasons holds the translated text and jdg_rea_original the text as entered
use `participations', clear
merge 1:1 randomID using `translations', nogenerate
assert mi(translation) if inlist(Country,"India","USA")
replace translation = judgmentreasons if inlist(Country,"India","USA")
rename (judgmentreasons translation) (jdg_rea_original judgmentreasons)
save `participations', replace


*** click/display + sequence data: how much time (minutes) a participant spent with each of the documents; plus sequence data for path analysis (to be completed at end of script)

* Step 1: from the main data still in memory, create one pseudo-click "end" per participant
gen double time = clock(timestampstep4,"YMD#hms#") // time when document views end (participant proceeds to judgment) -- this will serve to calculate length of last document view
gen double starttime = clock(timestampstep3,"YMD#hms#") // time when document views start (participant enters main document area) -- this will serve to check data integrity
keep randomID time starttime
gen document = "end"
save `endtime'

* Step 2: stack the raw document requests of all countries
clear
save `docviews', emptyok
foreach country in Argentina Brazil China France Germany India USA {
	import delimited `country'/document_requests.csv, varnames(1) clear
	if !inlist("`country'","USA","India") drop if regexm(doc,"/en_US")  // non-English-speaking countries may have en_US docs (only) in trial runs
	append using `docviews'
	save `docviews', replace
	}

* Step 3: harmonize document names across country versions
rename participationid randomID
foreach code in es_AR zh_CN de_DE fr_FR en_US { // remove codes for the country versions of the documents
	replace document = subinstr(document,"/`code'","",1) //  NB: de_DE was accidentally but inconsequentially attached to Portuguese version as well (Roland email 4/20/2018) 
	}
replace document = subinstr(document,"/documents/","",1)
replace document = "toc" if doc=="/documents"
replace document = "precedent" if regexm(doc,"precedent") // there are versions /2 and /3 in the data as well because of splitting of the long docs
replace document = "trialjudgment" if regexm(doc,"trial_judgment") // id.

* Step 4: make (participant, time) unique
gen double time = clock(timestamp,"YMD#hms#")
duplicates drop randomID time doc, force
duplicates report random time // 7 (3 from useable IDs, checked separately)
isid random time idof
bys random time (idof): keep if _n==_N // in case of duplicates by time, keep the higher ID of the document request = later click (this only concerns the 3 duplicates listed above)
drop idof timestamp

* Step 5: add the "end" pseudo-clicks and store the click-level data for the sequence analysis
append using `endtime'
save `sequence' // this will be saved as sequence.dta at end of the script after dropping unuseable observations & checking consistency

* Step 6: time per document = time until the same participant's next click, summed by document, in minutes, one column t_<document> per document
bys random (time): gen t_ = time[_n+1]-time
assert t_==. if document=="end" // should be because nothing comes after "end" so time[_n+1] should be empty
drop if document=="end"
collapse (sum) t_, by(randomID document)
replace t_ = minutes(t_)
reshape wide t_, i(randomID) j(document) string
recode t_* (.=0) // see Importcode_addl_gap_analysis.do - many participants actually did not look at certain documents. I also spot-checked the raw data of 2 French participants with t_brief==0: jr2vhf9r9525qdaj881e8vilp1 and sn0jo0ob3kp2qu3ou2uvgvqft1
egen t_total = rowtotal(t_*)

* Step 7: combine with the main data
merge 1:1 randomID using `participations' // not restricting to matched results of any kind to detect errors and extend all exclusion rules to all data
list random Country timestampstep* if _merge==1 // only i0cj1iighmtohe9j0tup47bnq4: very short German test run. Will disappear when dropping mi(time4) etc.
drop _merge // _merge==2 is possible because some participations may have started but not progressed to documents --> in participations.xlsx but not document_requests.csv 


/********************************************************************
	II. CLEANING VARIABLES (other than RA coding)
*********************************************************************/

* dropping variables that were in system only for student runs
* ("cap" because they may not be in the files anymore anyway).
* NB: "foreach ... in" takes a plain list of names, so the list must not start with the keyword "varlist" --
* that word would itself be treated as a variable name to drop ("foreach ... of varlist" is not usable here
* because it requires every listed variable to exist)
foreach var in Amazon exitq7 exitq9 exitq10 exitq11 {
	cap drop `var'
	}
	
* renaming
* NB: some vars like randomID have longer names in the raw data, hence renaming (old names are given in abbreviated form)
rename (auto reasonf nationality precedent judgmentg primings anchor datausageallowed0withdrawn) ///
		(cancel_auto cancel_reason nat_ prec_  guilty prime anchor withdrawal)

* Brazil and India share exit-question fields 20-22 that the loop further below renames as French questions:
* copy their content into dedicated variables first, then blank the originals for these two countries
generate BRA_IND_know_ICL 			= exitq20 if inlist(Country,"Brazil","India") // this question was only asked in India but Brazil software had a field for this too
generate BRA_IND_judge_federal_state = exitq21 if inlist(Country,"Brazil","India") // this question was only asked in Brazil but India software had a field for this too
generate BRA_IND_docket_criminal 	= exitq22 if inlist(Country,"Brazil","India") // id.
foreach q in exitq20 exitq21 exitq22 {
	replace `q' = "" if inlist(Country,"Brazil","India")
	}

* exit questions: general ...
rename (exitq1 exitq2 exitq3 exitq4 exitq5 exitq6 exitq8) ///
		(confidence sentence know_ICL recognize_ ever_prosecutor ever_defender age_)
* ... and country-specific
rename (exitq12 exitq13 exitq14) ///
		(ARG_judge_prosecutor ARG_court_level ARG_court_subject)
rename (exitq15 exitq16 exitq17) ///
		(CHN_court_level age_non_US CHN_court_division)
rename (exitq18 exitq19) ///
		(GER_IND_court_division GER_docket)
rename (exitq20 exitq21 exitq22 exitq23 exitq24 exitq25 exitq26) ///
		(FRA_position FRA_appeals FRA_civ_crim FRA_ever_prosecutor FRA_ever_judge FRA_past_judge_appeal FRA_past_judge_civ_crim)

label variable ever_defender "asked only in US; can be shared only under confidentiality agreement"

* destring
destring canceled cancel_auto guilty prime anchor withdrawal, replace

* cancellations and withdrawals
* (a "cancellation" whose stated reason is submission on paper is treated as not canceled, and its reason is blanked)
replace canceled = 0 if cancel_reason=="paper [error - submitted judgment reasons on paper]"
replace cancel_reason = "" if cancel_reason=="paper [error - submitted judgment reasons on paper]"
recode withdrawal (0=1) (1=0) (.=0) // flip the coding: the raw variable (datausageallowed0withdrawn) is 0 for withdrawn, so after this line 1 = withdrawn; missing is set to 0

* treatment variables
* (labels are defined with the raw string values first so that encode assigns fixed codes; the display texts are swapped in afterwards)
label define nationality 1 "croatian" 2 "serbian"
label define precedent 1 "sainovic" 2 "vasiljevic" 3 "besic"
encode nat_, gen(def_nationality) label(nationality)
encode prec_, gen(precedent) label(precedent)
drop nat_ prec_
label define nationality 1 "sympathetic" 2 "unsympathetic", modify
label define precedent 1 "{it:Affirm}" 2 "{it:reverse}" 3 "{it:REVERSE}", modify // the italics correspond to use of italics in the paper's text -- omit otherwise

* time
* (timestampstep1, timestampstep2, ... become clock variables time1, time2, ..., one per step name in the list below)
local t=1
foreach step in start consent "instructions read" "documents finished" "judgment rendered" exit {
	gen double time`t' = clock(timestampstep`t',"YMD#hms#")
	format time`t' %tc
	label var time`t++' "time when clicked: `step'" // `t++' uses the current value of t, then increments it for the next step
	}
drop timestampstep*

replace time1 = time2 if mi(time1) & !mi(time2) // participant 30slktinbfsrui688preqhte36 -- inconsequential computer glitch (Roland email 2017/11/24)

* confidence
destring confidence, replace ignore("%")
replace confidence = . if confidence>100
replace confidence = confidence * 100 if confidence<1 & confidence !=0 // values strictly below 1 (other than 0) are rescaled by 100 -- presumably entered as fractions rather than percentages
* sentence
* (free-text answers are parsed into a number of years and a number of months, then combined into years with decimals)
list sentence if ustrregexm(sentence,"[.,]([0-9]+)") // only one ".30 ans" -- can safely ignore "." because in French, the period does not indicate digits smaller than 1, and numbers smaller than 1 are never given without a leading zero

gen years = sentence if Country=="USA" // US version had different instructions and asked and allowed only a number, not writing "years" etc. Verify in next two lines
assert !(ustrregexm(sentence,"a[nñ]o|an[s.,]|jahr|year|年", 1) | ustrregexm(sentence,"mes|mois|monat|month|月", 1)) if Country=="USA"
assert Country=="USA" if ustrregexm(sentence,"([0-9]+)") & !(ustrregexm(sentence,"a[nñ]o|an[s.,]|jahr|year|年", 1) | ustrregexm(sentence,"mes|mois|monat|month|月", 1))

* digits immediately followed (possibly after blanks) by a word for "year" / "month" in one of the study languages
replace years  = ustrregexs(1) if ustrregexm(sentence,"([0-9]+)[ ]*(a[nñ]o|an[s.,]|jahr|year|年)", 1)
replace years = ustrregexrf(years,"a[nñ]o|an[s.,]|jahr|year|年","",1)
destring years, replace
gen months = ustrregexs(1) if ustrregexm(sentence,"([0-9]+)[ ]*(mes|mois|monat|month|月)", 1)
replace months = ustrregexrf(months,"mes|mois|monat|month|月","",1)
destring months, replace

* answers that mention years or months but from which no number could be extracted are fixed by hand
list sentence if mi(years) & mi(months) & (ustrregexm(sentence,"a[nñ]o|an[s.,]|jahr|year|年", 1) | ustrregexm(sentence,"mes|mois|monat|month|月", 1)) // 4 -- cf. next 3 lines
replace years = 3 if inlist(sentence,"drei Jahre","三年")
replace years = 2 if sentence == "dos años ( 2)"
replace years = 14 if sentence == "十四年"

list sentence years months if years*months!=0 & !mi(months,years) // 4 obs
replace months = 0 if months==years*12 & !mi(months,years) // months merely restating the years (months == 12*years) must not be added on top

recode months years (.=0) if !(mi(months) & mi(years)) // recode to avoid propagation of missing from months to years or vice versa, IF not both missing
replace years = years + months/12
drop sentence months
rename years sentence // from here on, sentence is numeric (years)

* other exit questions (most differed slightly by country)
replace know_ICL = "yes" if inlist(BRA_IND_know,"very-well","well")
replace know_ICL = "no" if inlist(BRA_IND_know,"not-well","not-to-well")
label define yesno 0 "no" 1 "yes"
encode know_ICL, generate(knowICL) label(yesno)

encode recognize_, generate(recognize) label(yesno)

replace age_ = age_non_US if mi(age_) // NB: "age" was collected only in US but not disclosed here because of IRB privacy restrictions. We did not collect age data in ARG.
label define age 0 "under30" 1 "30-40" 2 "40-50" 3 "over50"
encode age_, generate(age) label(age)

* prosecutor: 0/1 indicator (never missing, because it is generated from logical expressions without an if-condition)
gen prosecutor = ARG_judge_prosecutor=="prosecutor" | GER_IND_court_division=="Staatsanwaltschaft" | /// I eliminate prosecutors below, gen var. for possible future use only
				(FRA_position=="prosecutor" & FRA_ever_judge=="no") // this counts as judges former French judges who are now prosecutors

* judge_appeals: 1 = appeals, 0.5 = court hears both trial and appeals cases, 0 = otherwise, missing where not collected
gen judge_appeals = ARG_court_level=="appeal" | CHN_court_level=="Supreme" | FRA_appeals=="yes" | GER_IND_court_division=="Oberlandesgericht" | ///
	(FRA_position=="prosecutor" & FRA_ever_judge=="yes" & FRA_past_judge_appeal=="yes") /// as noted above, we count former French judges as judges
	if !inlist(Country,"Brazil","USA") // we did not collect this data in Brazil and US to avoid identifiability, so set var to missing for these
replace judge_appeals = 0.5 if inlist(CHN_court_level,"Intermediate","Provincial") | /// these Chinese courts hear both trial and appeals cases
	inlist(GER_IND_court_division,"Landgericht","district-court") // id. for German LGs and Indian district cts.

* judge_criminal: 1 = criminal, 0.5 = both, 0 = otherwise, missing where not collected
gen judge_criminal = ARG_court_subject=="criminal" | CHN_court_division=="Criminal" | FRA_civ_crim=="criminal" | GER_docket=="criminal" | ///
	(FRA_position=="prosecutor" & FRA_ever_judge=="yes" & FRA_past_judge_civ_crim=="criminal")  ///
	if !inlist(Country,"Brazil","India","USA") // we did not collect this data in Brazil (glitch), India (don't know why), and US (identifiability), so set var to missing for these
replace judge_criminal = 0.5 if FRA_civ_crim=="both" | (FRA_position=="prosecutor" & FRA_ever_judge=="yes" & FRA_past_judge_civ_crim=="both") | GER_docket=="both"

replace ever_prosecutor = FRA_ever_prosecutor if FRA_position=="judge" // NB: US data are missing in public version of data due to IRB restrictions
replace ever_prosecutor = "yes" if FRA_position=="prosecutor" & FRA_ever_judge=="yes"
encode ever_prosecutor, generate(everprosecutor) label(yesno)

assert mi(BRA_IND_judge_federal_state) // this and next weren't collected due to software glitch
assert mi(BRA_IND_docket_criminal)

* drop the raw string versions (names below are partly abbreviated)
drop age_* know_ICL recognize_ ever_prosecutor BRA_IND_know BRA_IND_judge BRA_IND_docket ARG* CHN* GER* FRA_civ FRA_app FRA_past* FRA_ever_p // keeping FRA_position and FRA_ever_judge for filtering below

* generate convenience variables
gen duration=minutes(time5-time2) // minutes from time2 to time5

* conflict = 1 if precedent and defendant push in opposite directions, i.e., if either both or neither of
* (precedent is Affirm) and (defendant is sympathetic) hold -- the sum of the two indicators is then 2 or 0, i.e., !=1.
* NB: the label text must be spelled exactly as in the value label ("sympathetic", all lower case): the lookup
* "text":labelname is case-sensitive and returns missing for a text that is not in the label, in which case the
* second indicator would be 0 for every observation with non-missing def_nationality
gen conflict = ((precedent==1)+(def_nationality=="sympathetic":nationality))!=1 // either prec=Affirm-->affirm AND def=Croat=nice, or neither
label var conflict "precedent and emotions push in opposite directions"

* encoding: the string Country is replaced by a numeric country carrying value label "country",
* which is what expressions of the form "Argentina":country below rely on
encode Country, generate(country) label(country)
drop Country

* checkpoint of the cleaned, not yet filtered data
save `participations', replace


/********************************************************************
	III. FILTERING USEABLE PARTICIPATIONS (other than refusal -- see RA coding below)
*********************************************************************/

* NB: some filters (e.g., based on time) are possibly redundant, i.e., already covered by other filters or empty set

*** a) Clear cases of not useable, and no further interest

* explicit trials and other non-participants or non-judges
drop if ustrregexm(judgmentreasons,"^(test|trial|holger)",1)
drop if judgmentreasons=="This is a trial"
drop if regexm(judgmentreasons,"(Denise Neary, FJC)") // Denise Neary was the US organizer and participated for fun. She is not a judge
drop if prosecutor | (FRA_position=="other" & FRA_ever_judge=="no")
drop prosecutor FRA_position FRA_ever_judge
drop if country=="Brazil":country & inlist(shortID,"QHL", "EPB", "V5U") // three non-judges in the room participated -- we know their IDs because we asked them ex post to pick out the reasons they wrote
drop if country=="China":country & randomID=="hhlairgpsgq8n9h9eus7nlefp5" // worked with English docs -- probably preview participation, in any event unusable (missing time4 anyway)

* timing (this serves to filter out test participations and, in China, failed attempts of different recruiting)
* For each country, only participations whose start (time1) falls into a known session window are kept (times are GMT);
* for France and USA the window is asserted rather than used for dropping
drop if country=="Argentina":country & !(inrange(time1,clock("2015-11-26 15:55","YMD hm"),clock("2015-11-26 16:10","YMD hm")) | /// first conference BA 11/26/2015, 2 rounds:
										 inrange(time1,clock("2015-11-26 17:15","YMD hm"),clock("2015-11-26 17:25","YMD hm")) | ///      1pm and 2pm local (GMT-3) plus/minus
										 inrange(time1,clock("2016-03-30 19:25","YMD hm"),clock("2016-03-30 19:45","YMD hm"))) // second conference (San Isidro)
drop if country=="Brazil":country &		!inrange(time1,clock("2018-04-19 13:00","YMD hm"),clock("2018-04-19 13:30","YMD hm")) // session started 2018/4/19 at 9am EST-DST = 13:00 GMT, but participants trickled in slowly so study didn't start until after 13:15 and still there were latecomers
drop if country=="China":country &		!inlist(dofc(time1),date("2016-04-12", "YMD"),date("2016-04-18", "YMD")) // April 12 and 18 are the dates Zhuang had judges in the seminar room -- in class on the 12th, after class on the 18th
assert inrange(time1,clock("2017-11-15 7:55","YMD hm"),clock("2017-11-15 8:15","YMD hm")) if country=="France":country // session 2017/11/15 9am CET = 8am GMT
drop if country=="Germany":country &	!(inrange(time1,clock("2017-09-06 18:00","YMD hm"),clock("2017-09-06 18:15","YMD hm")) | /// Trier 1
										 inrange(time1,clock("2017-10-25 18:00","YMD hm"),clock("2017-10-25 18:15","YMD hm")) | /// Trier 2  
										 inrange(time1,clock("2017-11-22 19:00","YMD hm"),clock("2017-11-22 19:16","YMD hm")) | /// Trier 3
										 inrange(time1,clock("2018-06-04 17:30","YMD hm"),clock("2018-06-04 17:45","YMD hm"))) //   Nenndorf
drop if country=="India":country &		!inrange(time1,clock("2018-02-25 07:15","YMD hm"),clock("2018-02-25 07:45","YMD hm"))
assert inrange(time1,clock("2015-04-14 14:10:00","YMD hms"),clock("2015-04-14 15:15:00","YMD hms")) if country=="USA":country //session started 2015/4/14 at 10:15am EST-DST = 14:15 GMT.  This is only for filtering tests: for iPad crash issues, see below
list country time1-time4 duration if duration<1.5 // [2 Chinese, 1 Argentinians--> nothing to do with iPad/computer crashes (see below).] These are really short (<90 seconds) ...
assert mi(judgmentreasons) if duration<1.5 // ... AND don't have reasons --> safe to assume they were tests
drop if duration<1.5 // NB: observations with missing duration are not affected (missing is not <1.5)

* computer freezes (investigation only -- determine the earliest time a participant could have been transferred after viewing treatments and hence being contaminated --> exclude later starts below)
* Result of this block: scalars mintime3USfreeze and mintime3DEfreeze, which serve as cut-offs when the
* freeze-related exclusions are applied later in the script
sort time1
list time1-time4 if country=="USA":country & mi(time5) // mi(time5) (i.e., not formally finished judgment) is the maximum extent of freeze issues -- they needn't be freezes, but they could be. However, all time4's here are late, so even they did freeze at that point, they would be excluded anyway under rules below (which work off of the earliest time3)
list time1-time4 if country=="Germany":country & mi(time5) & dofc(time1)==date("25Oct2017","DMY") // in Germany, computer freezes only occured during the 25 Oct 2017 round. Same finding on time4's as in US.
sort time3
list randomID time1-time3 if country=="USA":country & mi(time4) // *#1 (gtt303tfn51p41tqnp02okvl05) viewed documents for 40 more minutes (according to document_visibility.csv and document_requests.csv for US). #2 (iu4kg849ep9rv78j26n951g2b5) did not view any documents after time3
sum time3 if randomID=="iu4kg849ep9rv78j26n951g2b5"
scalar mintime3USfreeze = r(min)
* guard: if the participant were not found (or had no time3), the scalar would be missing, and inrange() with a
* missing upper bound imposes no upper bound at all -- the US freeze filter below would then silently do nothing
assert !mi(mintime3USfreeze)
list randomID time1-time3 if country=="Germany":country & mi(time4) & dofc(time1)==date("25Oct2017","DMY")  // #1 (bev1ght5eaqmnminar3o41vac1) generated last document request at 18:08:07 and last view at 18:09:01, i.e., before #2 (lfk94eaq859b85bbbclpqm5vi5) reached time3 -- so 18:09:01 is earliest possible freeze
scalar mintime3DEfreeze = clock("2017-10-25 18:09:01","YMD hms")

* did not complete consent
misstable pattern time1-time5 // should be, and is, diagonal: time[x] can be missing only if time[x+1] is missing
drop if mi(time1) // this would be a computer where nobody even hit the "start" button -- so presumably nobody even touched it after we set up the machine
drop if mi(time2) // these are people who did not complete consent
tab country // "raw" participant data -- real judge participants irrespective of whether they finished

* did not proceed from instructions stage
drop if mi(time3)
tab country // "semi-raw" participant data -- real judge participants who at least looked at the main experimental materials

* missing crucial data (more stringent criteria implemented only later, in case one wants to keep borderline cases)
gen attrition = mi(time4, time5, guilty) // to test for differential attrition by ...
tab attrition precedent, exact // ... precedent
tabstat attrition, by(precedent)
tab attrition def_nat, exact // ... defendant
prtest attrition, by(def_nat)
egen treatmentgroup = group(precedent def_nat)
tab attrition treatmentgroup, exact // ... interaction of precedent and defendant
* NB: option contents() belongs to the syntax that -table- had up to Stata 16; Stata 17 and later accept it only
* under version control, hence the version prefix (which changes nothing when running Stata 16)
version 16: table precedent def_nat, contents(mean attrition sum attrition freq)
drop attrition treatmentgroup
drop if mi(time4) // time4 marks transition from documents to judgment stage -- if this is missing, participation can't have judgment or assurance of complete path
drop if mi(time5) // time5 is missing if the participant did not proceed from judgment screen, in which case the system might erroneously have recorded "guilty=0" as a default value (Roland email 8/26/2019)
assert !mi(guilty) // should be empty set because we already filtered on mi(time5). Also wouldn't show up in experimental tests anyway for lack of outcome variable. But would filter not serious observations for path analysis 
tab country // "finished" participations including dirty ones (see b below)

*** b) Borderline cases of not useable, or further interest in analysis
* After each exclusion step, collect_p records the exact p-values of the two treatment tests on the remaining data

collect_p "no_drops"

* not allowed to use
tab country if withdrawal==1
drop if withdraw==1 // 4 obs.
drop withdrawal
collect_p "withdrawal"

* computer freezes
drop if country=="USA":country & !inrange(time1,clock("2015-04-14 14:10:00","YMD hms"),mintime3USfreeze) // US iPad crash issue, cf. Spamann & Kloehn 2016 footnote 10, who only did this as a successful robustness check (same with one refusal stating in reasons "not enough time to form a judgment"), such that they had three more observations (based on time exclusion: c2jeqoc8ph1ghbbrg9cle6h506, o8s56qc94rh5vtln3ec06rhbu4; refusal: vcqjdqoa30p309cidlmlu9ua41).
drop if country=="Germany":country & dofc(time1)==date("25Oct2017","DMY") & time1>mintime3DEfreeze
collect_p "computer_freezes"

* duration -- suspiciously short Chinese observations
list random country duration t_total time3 time4 judgmentreasons if duration<10 | t_total<5 // we already dropped duration<1.5 above after verifying that they did not have judgmentreasons
drop if duration<5 | t_total<3 // threshold from previous line would give same result.
collect_p "short_Chinese"

* aborted unless in clear error
assert cancel_reason=="" // Had we not filtered on mi(time5) there'd be two German judges who thought it was irresponsible to decide in such a short time
assert cancel_auto!=1
drop cancel_reason cancel_auto
* NB: a bare "if canceled" is also true when canceled is missing (missing is nonzero), so a participant without
* cancellation information and without reasons would be dropped as "canceled"; hence the explicit test
drop if canceled!=0 & !mi(canceled) & mi(judgmentreasons) // none. If there are judgment reasons, then cancellation was probably in error
list judgmentreasons if canceled!=0 & !mi(canceled) // they read serious; 3 such cases in data

save `participations', replace


/********************************************************************
	IV. RA CODING OF REASONS
*********************************************************************/
* NB: this was delayed because filtering to (preliminary measure of) useable observation focuses the intercoder reliability analysis

* creating link file to add randomID to observations only identified by shortID
use shortID randomID using `shortIDs', clear
drop if random=="7kpastk3khm1rlc3othij8dcb3" // this is one of two randomID with shortID TG9 -- this randomID was used by RAs, short was used for other obs
save `shortIDs', replace

* ingesting RAs' Excel files: one file per RA, cleaned and merged by randomID into one wide file
* in which every coding variable carries the RA's name as suffix
qui {
local RAs_ingested = 0 // counts the RA files already stored in the tempfile RAcoding
foreach RA in Johne Tyler Julian Oliver {

	* NB: forward slash as directory separator, as everywhere else in this script -- it works on all platforms
	import excel RA_coding/Case_coding_`RA'.xlsx, sheet("Sheet1") clear
	drop in 1/2 // variable types and names in Excel files
	drop if B=="" // since B is the reason text, those would have to be empty lines -- should be automatic in "import excel" but just in case
	if "`RA'" == "Johne" { // this RA's file has one more column than the others (A-S rather than A-R)
		keep A-S
		destring C-E G-L N-P R, ignore("-") replace
		rename (A-S) (randomID text refusal reservations misunderstanding misund_notes factfinding factf_v_TrialC spec_dir subst_help ///
			prec_mentioned_gen prec_mentioned_spec prec_cited_spec prec_distinguish policy statute stat_int_method irrelevantfacts notes)
		}
	else {
		keep A-R
		destring C-E G-K M-O Q, ignore("-") replace
		rename (A-R) (randomID text refusal reservations misunderstanding misund_notes factfinding factf_v_TrialC factf_v_TC_specdir ///
			prec_mentioned_gen prec_mentioned_spec prec_cited_spec prec_distinguish policy statute stat_int_method irrelevantfacts notes)
		}
	rename (refusal-notes) =_`RA'

	* cleaning up some dirty IDs
	replace randomID = subinstr(randomID,".","",.)
	replace randomID = subinstr(randomID,",","",.)
	replace randomID = subinstr(randomID,"，","",.)
	replace randomID = subinstr(randomID,`"""',"",.)
	replace randomID = strtrim(randomID)
	replace randomID = "8e04m85ts29ibr360qmmfo5044" if randomID=="8e04m85ts29ibr360qmmfo50441" // misspelling
	replace randomID = "30slktinbfsrui688preqhte36" if randomID=="no ID available" // matched by comparison of judgment reasons
	replace randomID = "cjhvqrvu78n7g8mpkcpf9l3b45" if randomID=="[missing]" // German participant in Trier 3: in this round, only participant cjhvqrvu78n7g8mpkcpf9l3b45 did not submit electronic reasons but did submit judgment 
	count if randomID=="WHW"
	assert r(N)<=1
	if r(N)==1 drop if randomID=="l064grfs0uiekdgh29tvmce3d2" // this is the long ID corresponding to WHW. WHW contains real judgment text (paper); l064grfs0uiekdgh29tvmce3d2 only placeholder (electronic)

	* add randomID where it is missing (in RAs' files, some "randomIDs" were in reality short IDs)
	gen shortID = randomID
	replace randomID = "" if strlen(randomID)<5
	merge 1:1 shortID using `shortIDs', keepusing(randomID) update keep(1 3 4 5) nogenerate // adds missing randomIDs
	drop shortID

	* interpretation method
	gen interp_text_`RA'	= regexm(stat_int_method,"[Tt]ext|[Pp]lain")
	gen interp_purpose_`RA'	= regexm(stat_int_method,"[Pp]urpos|[Ii]ntent")
	gen interp_system_`RA'	= regexm(stat_int_method,"[Ss]ystem")

	* adding to previously ingested RAs. The merge is skipped only for the first file, when there is nothing to
	* merge with yet; it is deliberately not run under "capture", which would also hide genuine merge errors
	* (e.g., duplicate IDs), in which case the save below would silently discard the RAs ingested before
	if `RAs_ingested' merge 1:1 randomID using `RAcoding', nogenerate
	save `RAcoding', replace
	local ++RAs_ingested
	}
	}
merge 1:1 random using `participations', keepusing(random) keep(3) nogenerate // subsetting RA work to observations that are useable according to III.
recode prec_mentioned_*_Julian (2 = 1) // this RA counted precedents mentioned (2 for one participant) rather than apply binary 0/1 coding

* refusals
* (refusal = number of coders who coded a refusal, refusal_coders = number of coders with a non-missing refusal code)
egen refusal = rowtotal(refusal_*)
egen refusal_coders = rownonmiss(refusal_*)
assert inlist(refusal,0, refusal_coders) // this confirms that all four coders agreed on whether a participant refused participation
recode refusal (1/max = 1)
drop refusal_*

* intercoder reliability; generating one joint score
* (the joint score of an item is the sum of the coders' codes, computed for non-refusals only)
local vars "reservations misunderstanding factfinding factf_v_TrialC prec_mentioned_gen prec_mentioned_spec	prec_distinguish policy	statute	irrelevantfacts interp_text interp_purpose interp_system"
foreach var in `vars' {
	di _newline "`var':"
	kap `var'* if refusal==0 // agreement ranges between "slight" (misunderstanding) and "almost perfect" ...
	egen `var' = rowtotal(`var'_*) if refusal==0
	tab `var'
	}
list misunderstanding_* if inrange(misunderstanding,1,4), abb(20) // Julian (8), Taylor (6), Oliver (4), Johne (2)
keep `vars' refusal random

* merge with main data and clean up
merge 1:1 random using `participations', assert(2 3) nogenerate // _merge==2 are participants who finished and passed exclusion criteria in III. but did not write judgment reasons. _merge==1 impossible due to prior subsetting
list country judgmentreasons if refusal==1
drop if refusal==1
drop refusal
collect_p "refusal"
tab misunderstanding
list country judgmentreasons if misunderstanding>1 & !mi(misunderstanding)
drop if misunderstanding>1 & !mi(misunderstanding) // i.e., dropped if the coders' misunderstanding codes sum to more than 1; participants without RA coding (missing score) are kept
collect_p "misunderstanding"

*** correcting erroneous affirmed/reverse entries
* The Excel file lists, by ID, the verdict inferred from the written reasons (column Affirmed); where such a value
* is available it replaces the recorded verdict, which is kept as guilty_raw
preserve
* NB: forward slash as directory separator, as in the rest of this script -- it works on all platforms
import excel "RA_coding/guilty_inferred_from_reasons_Johne.xlsx", sheet("Sheet1") cellrange(A1:C1001) firstrow clear
keep if !mi(ID)
keep ID Affirmed
rename (ID Affirmed) (randomID corrected)
tempfile corrected
save `corrected'
restore
merge 1:1 randomID using `corrected', nogenerate assert(2 3) keep(3) // every observation in memory must be in the Excel file; IDs found only there are discarded
tab corrected guilty
destring corrected, ignore("?") replace // characters to ignore given as a quoted string, as in the other destring calls
clonevar guilty_raw = guilty
replace guilty = corrected if !mi(corrected)
collect_p "corrected"

/********************************************************************
	V. FINISH MAIN DATA
*********************************************************************/
* arrange the main variables at the front, minimize storage types, and write the final main data set
order randomID shortID country cancel* def_nationality precedent prime anchor time* t_* duration ///
	guilty sentence judgmentreasons jdg_rea_original
compress
save maindata.dta, replace

* p-values collected along the way: two rows (precedent, nationality) were posted per cleaning step,
* so consecutive pairs of rows receive the same step number
frame change pvalues
generate droporder = ceil(_n/2)
save effect_of_cleaning_on_p.dta, replace
twoway	(scatter p droporder if test == "nationality") ///
		(scatter p droporder if test == "precedent", mlabel(after) legend(label(1 "nationality") label(2 "precedent")))

/********************************************************************
	VI. SEQUENCE DATA
********************************************************************/
* Input: the click-level data stored in tempfile sequence (one row per participant and click, plus one "end" row per participant)
* Output: sequence.dta, one row per participant (ID) and discretized period (eventtime) holding the document (doc) live in that period
use `sequence', clear
merge m:1 randomID using maindata, keepusing(country) assert(1 3) keep(3) nogenerate // get rid of unuseable participations, add country indicator
assert time-starttime<(1000*60*60) if !mi(starttime) // since time is in ms, 1000ms * 60s * 60 = 60 min -- no participation should be longer given 55 min cap
drop starttime

isid random time // time is tc calendar time (ms)
replace document = "briefs" if regexm(doc,"brief") // treating appellant's and respondent's briefs as interchangeable
replace document = "facts" if document == "statement_of_facts" // for easier display in graphs

* checking if there are participants with long stretches of viewing toc -- if not, we can drop them to make computation etc. easier
bys random (time): gen pos = _n // in which order did the click occur
by  random (time): gen length = time[_n+1] - time // how long did it stay live
sort length

tab doc if pos == 1 // only toc
sum length if pos>1 & doc=="toc" // there are 11 views of toc after start (pos>1), mean 13 seconds, longest 26 seconds -- ignorable
assert r(max)<27000
assert r(N)/_N<.005
drop if doc=="toc"

* instruction views: chop off at the start
tab doc [iw=length] // 1.54% instructions
bys random (time): drop if _n==1 & doc=="instructions" // chop off instruction views in the beginning ...
while r(N_drop)>0 {
	bys random (time): drop if _n==1 & doc=="instructions" // ... and keep doing this until there are none left
	}
tab doc [iw=length] // 0.81% instructions

* eventtime (including more chopping off of instructions)
scalar totalclicks = _N
gen int eventtime = . // starting off with missing values makes it possible to do the loop below
local drop = 1
while `drop'!=0 { // chop off instructions at eventtime==1 until there aren't any anymore even after dropping duplicates within that event window 1
	bys random (time): replace eventtime = round(500*(time - time[1])/(time[_N]-time[1])) + 1 //  normalized & discretized to 500 periods (oma crashes with >500 sequence elements) (NB: t=501 is "end", see below)
	bys random eventtime (time): drop if eventtime==1 & _n<_N // keep later click in the first event window
	drop if eventtime==1 & doc=="instructions" // chop off initial view if it is instructions (we already did this but rounding & keeping last could have re-created)
	local drop = r(N_drop) // number of observations dropped by the immediately preceding drop; the loop ends when it is 0
	}
bys random eventtime (time): keep if _n==_N // now keep later observation in any event window (NB: these are click data and hence sparse, so it is important to preserve the later click, which will probably stay live for a while)
assert _N>0.95*totalclicks // confirm we have only removed a small fraction of clicks during cleaning

* dropping "end" markers
assert document=="end" if eventtime==501
drop if eventtime == 501

* encoding
* (from here on, doc is the new numeric variable; before this point "doc" was an abbreviation of the string variable document)
encode random, gen(ID)
encode document, gen(doc)
drop random document time pos length

* fill the time space with document then still live
xtset ID eventtime
tsfill, full
bys ID: replace country = country[1] if mi(country) // NOTE(review): this line and the next rely on the data being ordered by eventtime within ID (presumably as left by tsfill) -- confirm
bys ID: replace doc = doc[_n-1] if mi(doc) // same as above - since these are click data, the right approach is to fill with earlier doc until next click comes

compress
save sequence.dta, replace


/********************************************************************
	APPENDIX: Creating summary statistics of US demographics -- only possible with access to US data before de-personalization, which requires confidentiality agreement
/*********************************************************************/
* NOTE(review): the last line of this appendix's header opens a new block comment and closes one, while the header's
* first line has already opened another; since block comments nest in Stata, everything below appears to be commented
* out and not executed when the do-file is run -- confirm that this is intended before relying on USdemographics
* Purpose: shares of the age and gender categories and of ever-prosecutors/ever-defenders among the useable US participations
* IF YOU HAVE ACCESS TO THE DATA, CHANGE THE PATH IN THE NEXT LINE TO WHERE YOU HAVE STORED IT
import delimited "C:\Users\Holger Spamann\OneDrive - Harvard University\Documents\Projects\In_progress\experiment_results\Harvard\Judges_participations_2015-04-15T06-56-03+00-00_decrypted.csv", encoding(UTF-8) bindquote(strict) clear
rename random randomID
merge 1:1 random using maindata.dta, keepusing(random) keep(3) nogenerate // this retains only the useable participations -- see comments in maindata_and_sequence_creator.do
rename exitquestionsquestion#* exitq#
rename (exitq5 exitq6 exitq8 exitq9) (ever_prosecutor ever_defender age gender) // NB: we did not ask court level (appellate?) in US; "criminal" makes no sense in US anyway
cap label define yesno 0 "no" 1 "yes"
encode ever_prosecutor, generate(everprosecutor) label(yesno)
encode ever_defender, generate(everdefender) label(yesno)
tab age, gen(age) // creates one indicator variable per age category (age1, age2, ...); id. for gender in the next line
tab gender, gen(gender)
drop age gender
* collapse does not keep the variable labels of the indicators, so they are stored in locals before and re-attached after
foreach v of varlist age* gender* {
	local l`v' : variable label `v'
	}
collapse (mean) age* gender* everp everd
foreach v of varlist age* gender* {
	label var `v' "`l`v''"
	rename `v' `v'_US
	}
gen byte country="USA":country // NOTE(review): no command visible in this appendix defines value label "country" in the data in memory after the import -- verify that this lookup resolves
save USdemographics, replace